Part 1 Classification

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import qgrid  # for interactive dataframe view
from pycaret.classification import *

sns.set_style("ticks")
In [2]:
# custom function define

## for EDA - Binning | Counts | Ratio : Now this is not required function, Look Sweetviz results~!
def ratio_plot(dataset0, Y_Cols):
    """Per-feature count / positive-count / ratio plots for a binary target.

    Parameters
    ----------
    dataset0 : pd.DataFrame
        Frame whose LAST column is the target; every other column is a
        predictor. Float columns are quantile-binned (5 bins) before grouping;
        all other columns are grouped on their raw values.
    Y_Cols : list of str
        Target column name(s). With a 0/1 target, sum/count per bin is the
        event ratio.

    Draws one figure per predictor: stacked bars (all rows vs. positives) on
    the left axis and the ratio line on a twin right axis, with the right
    axis's ticks aligned to the left axis's grid.
    """
    cols_float = dataset0.dtypes[dataset0.dtypes == float].index
    cols_object = dataset0.dtypes[dataset0.dtypes == object].index
    ratio_df_list = []
    dataset_bin = dataset0.copy()
    for col_i in dataset0.columns[:-1]:
        if col_i in cols_float:
            # BUGFIX: bin the function argument `dataset0`, not the global
            # `dataset` (the original accidentally read an outer-scope variable).
            dataset_bin[col_i] = pd.qcut(dataset0[col_i], q=5, duplicates='drop')

        cnt = dataset_bin.groupby(col_i)[Y_Cols].count()
        sum_1 = dataset_bin.groupby(col_i)[Y_Cols].sum()
        rst_df = pd.concat((cnt, sum_1), axis=1)
        rst_df.columns = ['count', 'sum']
        rst_df['ratio'] = rst_df['sum'] / rst_df['count']
        ratio_df_list.append(rst_df)

    for df_ratio_i in ratio_df_list:

        labels = [str(i) for i in df_ratio_i.index]
        fig, ax = plt.subplots()
        # overlay: total rows per bin, then positive rows per bin
        ax.bar(labels, df_ratio_i[df_ratio_i.columns[0]], label='ALL')
        ax.bar(labels, df_ratio_i[df_ratio_i.columns[1]], label=f"{Y_Cols}")

        ax.set_ylabel(f"Count All and {Y_Cols}", color="black", fontsize=14)
        ax.set_xlabel(df_ratio_i.index.name)
        ax.set_xticks(labels)

        # twin axis for the ratio line
        ax2 = ax.twinx()
        ax2.plot(labels, df_ratio_i[df_ratio_i.columns[2]], color="blue", marker="o")
        ax2.set_ylabel("Ratio", color="blue", fontsize=14)

        # place ax2's ticks at the same relative positions as ax's ticks so
        # both axes share one visual grid
        ylim1 = ax.get_ylim()
        len1 = ylim1[1] - ylim1[0]
        yticks1 = ax.get_yticks()
        rel_dist = [(y - ylim1[0]) / len1 for y in yticks1]
        ylim2 = ax2.get_ylim()
        len2 = ylim2[1] - ylim2[0]
        yticks2 = [ry * len2 + ylim2[0] for ry in rel_dist]
        ax2.set_yticks(yticks2)
        ax2.set_ylim(ylim2)  # <-- re-adjust the limits to the original values

        # Just add a title and rotate the x-axis labels to be horizontal.
        plt.title(f'Counts and Ratio for {df_ratio_i.index.name}')
        plt.xticks(rotation=90, ha='center')
        plt.show()

## Feature Importance - Tree|MDI based Classification Feature Imp. VS. permutation Feature Imp.
def FI(model, dataset, X_Cols, Y_Cols):
    """Draw model-specific (MDI) and permutation feature importances side by side.

    Parameters
    ----------
    model : fitted estimator
        Left panel uses ``model.feature_importances_`` (tree models only);
        right panel uses sklearn's permutation importance on `dataset`.
    dataset : pd.DataFrame
        Frame containing both X and y columns; rows with any NaN are dropped
        before permutation scoring.
    X_Cols, Y_Cols : feature / target column names.

    Each panel is best-effort: if a panel cannot be produced (e.g. a stacked
    model has no ``feature_importances_``), the failure is printed and the
    other panel is still drawn.
    """
    X_names = np.array(X_Cols)

    fig = plt.figure()
    try:
        # left panel: impurity-based importances, sorted ascending for barh
        model_FI = model.feature_importances_   #tuned_model
        sorted_idx = model_FI.argsort()
        y_ticks = np.arange(0, len(X_names))
        ax = plt.subplot(121)
        ax.barh(y_ticks, model_FI[sorted_idx])
        ax.set_yticks(y_ticks)
        ax.set_yticklabels(X_names[sorted_idx])
        ax.set_title("Feature Importances (MDI: Mean Decrease in Impurity)")
    except Exception as err:
        # e.g. ensembles/stacks without feature_importances_ land here;
        # surface the reason instead of swallowing it silently
        print(f"FI failed: {err}")

    from sklearn.inspection import permutation_importance

    try:
        # right panel: model-agnostic permutation importances on complete rows
        complete_rows = dataset[dataset.isna().sum(axis=1) == 0]
        result = permutation_importance(
            model,
            complete_rows[X_names],
            complete_rows[Y_Cols],
            n_repeats=10,
            random_state=42,
            n_jobs=2)
        sorted_idx = result.importances_mean.argsort()

        ax = plt.subplot(122)
        ax.boxplot(result.importances[sorted_idx].T,
                   vert=False,
                   labels=X_names[sorted_idx])
        ax.set_title("Permutation Importances (train set)")
    except Exception as err:
        print(f"PFI failed: {err}")
    fig.tight_layout()
    fig.show()
In [3]:
# Load the Kaggle Titanic data; paths are relative to the notebook directory.
train = pd.read_csv('./input/titanic/train.csv')
test = pd.read_csv('./input/titanic/test.csv')
sub = pd.read_csv('./input/titanic/gender_submission.csv')  # sample submission format
In [4]:
# Add random noise variables (one categorical, one numeric) as baselines for
# interpreting feature importance: genuine features should rank above them.
rng = np.random.RandomState(seed=42)

train["random_cat"] = rng.randint(3, size=train.shape[0])
train["random_num"] = rng.randn(train.shape[0])

# NOTE(review): the same rng stream continues here, so the test columns are
# drawn from a different point in the sequence than train's.
test["random_cat"] = rng.randint(3, size=test.shape[0])
test["random_num"] = rng.randn(test.shape[0])

Check Data

In [5]:
# Interactive, scrollable grid view of the raw training frame (qgrid widget).
qgrid.show_grid(train, grid_options={'maxVisibleRows': 5})
In [6]:
# Sweetviz EDA report comparing train vs. test distributions against the target.
import sweetviz as sv
comparison_report = sv.compare([train,'Train'], [test,'Test'], target_feat='Survived')
comparison_report.show_notebook(scale=0.8)
In [11]:
# Feature / target column definitions.
X_Cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X_Cols_categ = ['Sex', 'Pclass']  # intended categorical features
X_Cols_Rand = ['random_cat','random_num']  # noise baselines for importance ranking
Y_Cols = ['Survived']

dataset0 = train[X_Cols + X_Cols_Rand + Y_Cols]

dataset = dataset0.copy()

# NOTE(review): despite the name, this selects the NON-float columns
# (dtypes != float); those are exactly the columns cast to float below —
# confirm the name/intent.
cols_float = dataset0.dtypes[dataset0.dtypes != float].index
cols_object = dataset0.dtypes[dataset0.dtypes == object].index

# Integer-encode the two string categoricals.
dataset.Sex = dataset.Sex.replace({'male': 0, 'female': 1})
dataset.Embarked = dataset.Embarked.replace({'S': 0, 'Q': 1, 'C': 2})

# Cast the selected (non-float) columns to float so the frame is uniformly numeric.
for col_i in dataset.columns:
    if col_i in cols_float:
        dataset[col_i] = dataset[col_i].astype(float)

qgrid.show_grid(dataset, grid_options={'maxVisibleRows': 5})
In [8]:
# Sweetviz intra-dataset comparison: male vs. female passengers against Survived.
intra_report = sv.compare_intra(train, train["Sex"] == 'male', ["Male", "Female"], 'Survived')
intra_report.show_notebook(w=900, h=450, scale=0.8)
In [9]:
# Pairwise scatter matrix with regression fits, split by Sex.
# NOTE(review): diag_kws={'bw': ...} targets the older seaborn KDE API —
# newer versions renamed this to bw_method/bw_adjust; confirm version.
sns.pairplot(dataset, hue='Sex',corner=True,
               palette='husl',kind='reg',diag_kws={'bw': 0.2})
Out[9]:
<seaborn.axisgrid.PairGrid at 0x27dae5dcc70>

SetUp for Modeling

In [10]:
# One-Hot Encoded
# (kept for reference: setup() on the raw `train` frame with explicit
#  categorical/ignore column lists; disabled in favor of the pre-encoded frame)
'''
clf1 = setup(data = train, 
             #test_data = test,
             #normalize = True,
             target = 'Survived',
             remove_outliers = True,
             numeric_imputation = 'mean',
             categorical_features = ['Sex','Embarked'], 
             ignore_features = ['Name','Ticket','Cabin'],
             fold_shuffle=True, session_id=2,
             imputation_type='iterative',
             silent = True)

'''

#Categorical values are well ordered using integer
# PyCaret classification setup on the integer-encoded `dataset`:
# outlier removal on, iterative (model-based) imputation with mean numeric
# imputer; silent=True skips the interactive dtype-confirmation prompt.
clf1 = setup(data = dataset, 
             #test_data = test,
             #normalize = True,
             target = 'Survived',
             remove_outliers = True,
             numeric_imputation = 'mean',
             #fold_shuffle=True, session_id=2,
             imputation_type='iterative',
             silent = True)
  Description Value
0 session_id 7668
1 Target Survived
2 Target Type Binary
3 Label Encoded 0.0: 0, 1.0: 1
4 Original Data (891, 10)
5 Missing Values True
6 Numeric Features 7
7 Categorical Features 2
8 Ordinal Features False
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (591, 11)
12 Transformed Test Set (268, 11)
13 Shuffle Train-Test True
14 Stratify Train-Test False
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI a036
22 Imputation Type iterative
23 Iterative Imputation Iteration 5
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model Light Gradient Boosting Machine
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model Light Gradient Boosting Machine
28 Unknown Categoricals Handling least_frequent
29 Normalize False
30 Normalize Method None
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance False
37 Combine Rare Levels False
38 Rare Level Threshold None
39 Numeric Binning False
40 Remove Outliers True
41 Outliers Threshold 0.050000
42 Remove Multicollinearity False
43 Multicollinearity Threshold None
44 Clustering False
45 Clustering Iteration None
46 Polynomial Features False
47 Polynomial Degree None
48 Trignometry Features False
49 Polynomial Threshold None
50 Group Features False
51 Feature Selection False
52 Features Selection Threshold None
53 Feature Interaction False
54 Feature Ratio False
55 Interaction Threshold None
56 Fix Imbalance False
57 Fix Imbalance Method SMOTE

Select Model (Compare/Select/Tune)

Compare ML algorithm

In [16]:
# Cross-validate all available classifiers (turbo=False also runs the slow
# ones), excluding the Gaussian Process classifier; keep all 18, take top 3.
model_top18 = compare_models(n_select=18,exclude=['gpc'],turbo=False)
model_top3 = model_top18[:3]
  Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
catboost CatBoost Classifier 0.8359 0.8861 0.7556 0.8221 0.7840 0.6523 0.6573 0.5400
rf Random Forest Classifier 0.8275 0.8851 0.7431 0.8114 0.7723 0.6343 0.6392 0.0680
et Extra Trees Classifier 0.8241 0.8650 0.7259 0.8143 0.7638 0.6250 0.6309 0.0610
gbc Gradient Boosting Classifier 0.8088 0.8759 0.7259 0.7810 0.7486 0.5952 0.6000 0.0380
lightgbm Light Gradient Boosting Machine 0.8055 0.8784 0.7348 0.7714 0.7478 0.5904 0.5958 0.0530
mlp MLP Classifier 0.8054 0.8755 0.7422 0.7700 0.7500 0.5915 0.5979 0.2630
ridge Ridge Classifier 0.7952 0.0000 0.7167 0.7553 0.7323 0.5671 0.5708 0.0070
lda Linear Discriminant Analysis 0.7952 0.8648 0.7167 0.7553 0.7323 0.5671 0.5708 0.0080
lr Logistic Regression 0.7935 0.8627 0.7038 0.7591 0.7266 0.5617 0.5666 1.5460
xgboost Extreme Gradient Boosting 0.7919 0.8597 0.7087 0.7588 0.7270 0.5601 0.5668 0.1250
ada Ada Boost Classifier 0.7919 0.8526 0.7293 0.7474 0.7334 0.5632 0.5685 0.0280
nb Naive Bayes 0.7783 0.8358 0.6866 0.7422 0.7064 0.5299 0.5379 0.0070
dt Decision Tree Classifier 0.7633 0.7509 0.6913 0.7104 0.6977 0.5038 0.5066 0.0070
svm SVM - Linear Kernel 0.7529 0.0000 0.7732 0.6855 0.7110 0.5006 0.5207 0.0080
knn K Neighbors Classifier 0.7344 0.7560 0.5826 0.7002 0.6296 0.4264 0.4347 0.0150
rbfsvm SVM - Radial Kernel 0.7192 0.7767 0.5745 0.6743 0.6154 0.3970 0.4036 0.0170
qda Quadratic Discriminant Analysis 0.6855 0.6953 0.5460 0.6757 0.5418 0.3285 0.3562 0.0110

Ensemble Model create

In [17]:
# Voting ensemble of the top-3 models, selected on AUC with 5-fold CV.
blended_l = blend_models(estimator_list = model_top3, fold = 5, optimize = 'AUC')
  Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.8235 0.8904 0.8085 0.7600 0.7835 0.6348 0.6357
1 0.8136 0.8601 0.7447 0.7778 0.7609 0.6082 0.6086
2 0.8220 0.8714 0.7447 0.7955 0.7692 0.6247 0.6256
3 0.8475 0.9079 0.7174 0.8684 0.7857 0.6690 0.6763
4 0.8220 0.8774 0.6957 0.8205 0.7529 0.6153 0.6205
Mean 0.8257 0.8814 0.7422 0.8044 0.7705 0.6304 0.6333
SD 0.0114 0.0165 0.0379 0.0377 0.0127 0.0213 0.0232

Stacking Model create

In [18]:
# Stacking ensemble of the top-3 models, selected on AUC with 5-fold CV.
stacked_I = stack_models(estimator_list = model_top3, fold = 5, optimize = 'AUC')
  Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.8151 0.8827 0.7872 0.7551 0.7708 0.6160 0.6164
1 0.8220 0.8625 0.7447 0.7955 0.7692 0.6247 0.6256
2 0.8475 0.8693 0.7660 0.8372 0.8000 0.6771 0.6789
3 0.7966 0.8816 0.6739 0.7750 0.7209 0.5622 0.5656
4 0.8305 0.8735 0.7174 0.8250 0.7674 0.6351 0.6390
Mean 0.8223 0.8739 0.7378 0.7976 0.7657 0.6230 0.6251
SD 0.0168 0.0076 0.0395 0.0305 0.0254 0.0370 0.0366

selected Model Tune

In [19]:
#model  = create_model('lightgbm')
# Pick the best single model from compare_models for tuning.
model  = model_top3[0]
#model  = model_top3[1]
In [20]:
# Hyperparameter tuning via PyCaret's tune_model (default search space, 10-fold CV).
tuned_model = tune_model(model)
  Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.8167 0.9190 0.8333 0.7407 0.7843 0.6259 0.6291
1 0.8136 0.8571 0.7083 0.8095 0.7556 0.6060 0.6095
2 0.7966 0.8512 0.5833 0.8750 0.7000 0.5553 0.5814
3 0.8814 0.9046 0.8696 0.8333 0.8511 0.7525 0.7530
4 0.8136 0.8587 0.7826 0.7500 0.7660 0.6111 0.6115
5 0.8475 0.8684 0.6957 0.8889 0.7805 0.6662 0.6780
6 0.8814 0.8889 0.7826 0.9000 0.8372 0.7446 0.7491
7 0.8644 0.9384 0.7391 0.8947 0.8095 0.7057 0.7135
8 0.7797 0.8502 0.7391 0.7083 0.7234 0.5404 0.5408
9 0.8644 0.8804 0.7826 0.8571 0.8182 0.7104 0.7123
Mean 0.8359 0.8817 0.7516 0.8258 0.7826 0.6518 0.6578
SD 0.0346 0.0290 0.0756 0.0669 0.0458 0.0717 0.0698
In [21]:
# Capture the last displayed CV score grid as a DataFrame.
# NOTE(review): pull() normally takes no model argument — confirm the
# `tuned_model` argument is intentional.
tuned_model_df = pull(tuned_model)

Evaluate Model

In [22]:
# Interactive evaluation widget (ROC, confusion matrix, etc.) for the tuned model.
evaluate_model(tuned_model)

Interpret Model Results

Modelless : Correlation and Similarity among X-Y

In [23]:
# Correlation matrix with hierarchical clustering (dendrogram) — a quick
# model-free view of X-Y and X-X similarity.
sns.set(font_scale=1.0)
corr = dataset.corr()

g = sns.clustermap(data=corr,
                   square=True,
                   annot=True,
                   cbar=True,
                   center=0,
                   vmin=-1,
                   vmax=1,
                   figsize=(8, 6),
                   annot_kws={"size": 8},
                   cmap="coolwarm")
  • 변수 설명
    • survived : 생존=1, 죽음=0
    • pclass : 승객 등급. 1등급=1, 2등급=2, 3등급=3
    • sibsp : 함께 탑승한 형제 또는 배우자 수
    • parch : 함께 탑승한 부모 또는 자녀 수
    • ticket : 티켓 번호
    • cabin : 선실 번호
    • embarked : 탑승장소 S=Southampton, C=Cherbourg, Q=Queenstown

  • We can easily see X-Y similarity using correlation matrix and dendrogram

    • Y 인자 ("Survived") 기준, Best Correlation "Gender", "SibSp", "P_Class" 순

      • 단순히 다른변수들과의 관계성이 존재하는 상황에서 여성의 경우, 부모/자녀가 있는 경우, 요금이 비싼 경우 생존확률 높음을 의미
    • X 인자들 기준

      • "Gender", "Fare", "Embarked" show relative good similarity (sign for correlation with the others)
      • "SibSp", "Parch" show ralative good similarity
        • SibSp, Parch 는 상관계수 0.4 수준의 중간 정도로 대가족이 탑승하는 경우 형제/배우자, 부모/자녀가 함께 타는 경향으로 해석
        • 형제/배우자 혹은 부모/자녀든 가족이 탑승하는 경우 티켓등급, 요금, 탑승장소가 유사한 경우가 존재함.

Feature Importance

  • Feature Importance (Model-Specific Global)
  • Permutation Feature Importance (Model-Agnostic Global)
  • 경우에 따라 MDI 방식의 Feature Importance는 Dummy로 사용하는 Random Variable 이 가장 최상의 Feature Importance를 갖게 되는 경우가 많음.
    • 연속형 변수와 high-cardinality 범주형 변수에 대해 편향됨
    • Training Data만 사용하므로, 모델 자체가 random 변수에 overfit 되는 문제

  • 아래 결과에는 두 경우 모두 Gender > Pclass(Fare) ~ Age 가 globally 모델의 Y값에 영향을 크게 주는 순서로 나타남
  • Embarked 나 SibSP(Parch)의 경우 random variable 보다 낮은 순위를 나타내고 있어 효과는 적다할 수 있겠음.
    • 단, Biz.Domain 상 가족이 많은 경우 가족 중 여성 및 어린아이들을 최우선적으로 살리려는 노력이 있어 무시할 수 없어 생존율 증가의 동기가 된다고 할 수 있는 경우, 함부로 의미없다라고 얘기할 수 없는 것이 실질적인 분석환경이겠음
In [24]:
# Rebuild the exact frame PyCaret trained on (transformed X_train + y_train),
# then plot both importance panels for the best model.
train_data_using_model = pd.concat((get_config("X_train"),get_config("y_train")),axis=1)
FI(model_top3[0], train_data_using_model, get_config("X_train").columns, Y_Cols)
  • Stacked Model 이나 Ensemble 모델의 경우,
    • Permutation Feature Importance 는 Input 에 따른 반응을 지켜보며 결과를 반환
    • 기존의 Mean Decrease Impurity 방식의 Pre-defined Model Feature Importance 는 작동불가
In [94]:
# MDI importances are unavailable for the stacked model, so only the
# permutation panel renders ("FI failed" output is expected here).
FI(stacked_I, train_data_using_model, get_config("X_train").columns, Y_Cols)
FI failed

PDP and ICE

  • PDP : Partial Dependency Plot - (Model-Agnostic-Global)
  • ICE : Individual Conditional Expectation - (Model-Agnostic-Local)

Target and Actual Plot

  • Target Plot 과 대비하여 모델의 출력값의 Trend를 비교 [Global]
  • 전체적으로 1 dimensional view로 보았을 때 경향을 유지하도록 훈련된 것을 확인함
In [102]:
# Compare the model's actual predictions (actual_plot) against the
# model-independent target trend in the data (target_plot), per feature.
from pdpbox import pdp, get_dataset, info_plots

for col_i in get_config("X_train").columns:
    # data-only view: mean target per feature bin
    fig, axes, summary_df = info_plots.target_plot(df=train_data_using_model,
                                                   feature=col_i,
                                                   figsize=(6,4),
                                                   feature_name=col_i,
                                                   target=Y_Cols)
    # model view: mean prediction per feature bin
    fig, axes, summary_df = info_plots.actual_plot(model=tuned_model,
                                                   X=get_config("X_train"),
                                                   figsize=(6,4),
                                                   feature=col_i,
                                                   feature_name=col_i)

Partial Dependency Plot with Individual Conditional Expectation

PDP and ICE shows

  1. 동일 조건에서 좌석 등급이 높을수록 생존에 유리
  2. 동일 조건에서 10살 이상이면 나이는 생존율에 불리
  3. 동일 조건에서 적당히 높은 요금제의 경우 생존율 높고 그 이후 큰 차이 없음
  4. 여성이었다면 생존율에 유리
  5. 동반가족의 경우 ICE로 볼 때, Case by Case
In [31]:
# One PDP/ICE figure per training feature.
for col_i in get_config("X_train").columns:
    pdp_i = pdp.pdp_isolate(model=tuned_model,
                            dataset=train_data_using_model,
                            model_features=get_config("X_train").columns,
                            feature=col_i)

    fig, axes = pdp.pdp_plot(
        pdp_isolate_out=pdp_i,
        plot_lines=True,  # True : ICE - individual curve, False : PDP with ICE range
        center=True,
        #plot_pts_dist=True,
        x_quantile=True,
        show_percentile=True,
        figsize=(8, 6),
        ncols=2,
        feature_name=col_i) 

Target and Actual Plot - 2D

  • 전반적인 경향을 학습한 상태를 확인
  • Age와 Fare가 독립적으로도 영향관계가 존재하는 것을 확인
    • (단순 correlation 때문에 어느 하나가 관계있어 다른 하나가 관계있다고 보이는것만이 아님)
In [32]:
# note datasize and interactions
# 2-D (Age x Fare) versions of target_plot and actual_plot: data trend vs.
# model predictions over the joint feature grid.
fig, axes, summary_df = info_plots.target_plot_interact(
    df=train_data_using_model, features=['Age', 'Fare'], feature_names=['Age', 'Fare'], target=Y_Cols
)

fig, axes, summary_df = info_plots.actual_plot_interact(
    model = tuned_model, X=get_config("X_train"), features=['Age', 'Fare'], feature_names=['Age', 'Fare']
)
In [33]:
# 2-D partial dependence of the tuned model on the Age x Fare pair.
inter1 = pdp.pdp_interact(model=tuned_model,
                          dataset=get_config("X_train"),
                          model_features=get_config("X_train").columns,
                          features=['Age', 'Fare'])

fig, axes = pdp.pdp_interact_plot(pdp_interact_out=inter1,
                                  feature_names=['age', 'fare'],
                                  figsize=(8, 9),
                                  plot_type= 'grid', #'contour'
                                  x_quantile=True,
                                  plot_pdp=True)
In [34]:
# Note target_plot is traditional EDA view !!
# Same idea done by hand: per-feature count/positive/ratio charts on raw data.
ratio_plot(dataset0,Y_Cols)

SHAP - SHapley Additive exPlanations

SHAP Feature Importance

  • Model-Agnostic-Global and Local
In [35]:
#interpret_model(tuned_model)
#interpret_model(tuned_model, plot='correlation',kwargs={vmin})
import shap

# for normal cases
# Build a SHAP explainer for the tuned model and score the transformed train set.
explainer = shap.Explainer(tuned_model)
expected_value = explainer.expected_value  # explainer's base value
shap_values = explainer(get_config("X_train"))
In [35]:
# Display the explainer's base value.
expected_value
In [64]:
# summarize the effects of all the features
# Three side-by-side global SHAP views: mean-|SHAP| bar chart, beeswarm
# distribution, and a decision plot over all training rows.
sns.set(font_scale=1.0)
plt.figure()

plt.subplot(1, 3, 1)
shap.plots.bar(shap_values, show=False)

plt.subplot(1, 3, 2)
shap.plots.beeswarm(shap_values, show=False, alpha=0.5)
plt.yticks([])  # hide duplicate feature labels (shared with the bar panel)

plt.subplot(1, 3, 3)
shap.decision_plot(explainer.expected_value,
                   shap_values.values,
                   get_config("X_train"),
                   link='logit',
                   show=False)
plt.yticks([])
plt.tight_layout()

#shap.summary_plot(shap_values, get_config("X_train"))

Global Feature 관점에서 PFI와 동일하게 Gender/P_Class, Fare/ Age 순서로 Y에 영향을 주는 형태로 나타나고 있다. Local 관점에서 Gender 에 따라 극명하게 결과가 나뉘는(분포가 분리되어 있는) 형태임을 볼 수 있고, Pclass, Fare 에 대해서도 꽤 분리도가 높게 나타난다. 즉, 개별 data 또한 Gender/Pclass/Fare 의 global 경향(방향성)을 따른다는 것을 알 수 있다.

SHAP Local Explanations

  • Note: for binary classification, the logit link maps the raw signal to a 0–1 probability; the waterfall plot shows the raw signal before the logit function is applied.
In [42]:
# Local explanation: additive force plot for one selected instance.
selection = 1
shap.initjs()  # load the JS runtime the interactive force plot needs
shap.force_plot(explainer.expected_value,
                shap_values.values[selection, :],
                get_config("X_train").iloc[selection, :],
                link='logit',
                #matplotlib=True,
                show=False)
Out[42]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [43]:
# Same instance, two static local views: decision plot (logit scale) and
# waterfall plot (raw model output before the logit link).
plt.figure(figsize=(8,6))
plt.subplot(1, 2, 1)
shap.decision_plot(explainer.expected_value,
                   shap_values.values[selection, :],
                   get_config("X_train").iloc[selection, :],
                   link='logit',
                   show=False)
plt.yticks([])

plt.subplot(1, 2, 2)
shap.plots.waterfall(shap_values[selection], show=False)

plt.tight_layout()

아래 interpret_model 을 통해 3가지 큰 부류확인

  • 여성이어서 살아남은 부류 : 생존율 가장 높은 그룹
  • 남성이지만,비싼 요금의 표와 1등 class 탑승객 : 2번째 생존율 높은 그룹
  • 그 외 : 생존율 낮은 그룹

SHAP Local/Global Explanations

In [44]:
# visualize all the training set predictions
# shap.plots.force(shap_values)
# PyCaret wrapper; plot='reason' renders the interactive SHAP force view.
interpret_model(tuned_model, plot='reason')
Out[44]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
 

SHAPly values VS. features per instance

모델은 아래와 같이 학습되었음 (실제 현상이 그런것이 아니라 모델이 그렇다는 것)

  • 좌석등급이 높을수록(3-->2-->1), Y값을 증가시키는 방향 우세 (좌석등급 최하 3에서는 여성이 더 생존율이 낮음을 유의)
  • 10살 미만에서 대부분 Y값의 기여가 양수이며 이 때 남여 영향관계가 보이지 않음. 그 이상 나이대에서는 여성의 생존율 높음
  • 부모/자식 수직가족관계에서 혼자인 경우 여성의 생존율이 높음
  • 요금 50 이상에서는 생존에 매우 유리
In [59]:
# SHAP - Correlation
# Dependence scatter per feature; color=shap_values lets SHAP pick the
# strongest interacting feature for the color axis.
for col_i in get_config("X_train").columns:
    shap.plots.scatter(shap_values[:, col_i], color=shap_values, alpha=0.5)
In [52]:
# (n_train_rows, n_features)
shap_values.shape
Out[52]:
(591, 11)
In [ ]:
# visualize all the training set predictions
#shap.force_plot(explainer.expected_value,shap_values)
#shap.force_plot(explainer.expected_value, shap_values, get_config("X_train"))
In [ ]:
# SHAP dependence plot for the first training feature via PyCaret's wrapper.
interpret_model(tuned_model,plot='correlation',feature=get_config('X_train').columns[0])
In [ ]:
# sklearn partial_dependency
'''
import matplotlib.pyplot as plt
#from sklearn.inspection import partial_dependence
#from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence

plot_partial_dependence(model_top3[2], X=get_config('X_train'), 
                        features=get_config('X_train').columns[1:2],
                        grid_resolution=20)
'''
In [114]:
# Simple PDP and ICE check
# Hand-rolled PDP/ICE without pdpbox: sweep one feature over a grid while
# holding every other column fixed, then average (PDP) or plot per-row (ICE).
ref_data = get_config('X_train').copy()
# keep the original row id as a plain column named 'index' (used as the ICE line id)
ref_data = ref_data.reset_index(drop=True).reset_index()
ref_data_expand = pd.DataFrame()
target_col_i = ref_data.columns[2]  # swept feature = 3rd column after the reset_index

# evenly spaced grid across the swept feature's observed range
Xs_grid = np.linspace(ref_data[target_col_i].min(),
                      ref_data[target_col_i].max(),
                      num=20)

# for each grid value, overwrite the feature for ALL rows and stack the copies
# (note: ref_data is mutated in place each iteration, which is what makes this work)
for grid_i in Xs_grid:
    ref_data[target_col_i] = grid_i
    ref_data_expand = pd.concat((ref_data_expand, ref_data), axis=0)

# class-1 probability for every (row, grid value) pair
# NOTE(review): the helper 'index' column is passed to predict_proba along with
# the features — confirm the model tolerates/ignores the extra column.
ref_data_expand['Y_hat'] = model_top3[0].predict_proba(ref_data_expand)[:, 1]


plt.figure(figsize=(6,4))
ax = plt.subplot(1, 1, 1)
# snap Age values to 20 bin midpoints so PDP/ICE share a common x grid
ss = pd.cut(ref_data_expand['Age'], 20)
s = [(i.left + i.right) / 2 for i in ss.values]
ref_data_expand['Age'] = s

# PDP
ref_data_expand.groupby('Age')['Y_hat'].mean().plot(xlim=[0, 80],
                                                    ylim=[0.30, 0.70],
                                                    lw=5,
                                                    c='blue',
                                                    ax=ax)
# ICE
ref_data_expand.pivot_table(index='Age', columns='index',
                            values='Y_hat').plot(xlim=[0, 80],
                                                 ylim=[0, 1],
                                                 legend=[],
                                                 c='green',
                                                 lw=0.2,
                                                 ax=ax)
Out[114]:
<AxesSubplot:xlabel='Age'>
In [ ]:
# Pairwise SHAP interaction values for the training set.
shap_interaction_values  = explainer.shap_interaction_values(get_config('X_train'))
# NOTE(review): sns.heatmap expects a 2-D array; if the interaction values come
# back as (rows, features, features) this will raise — aggregate (e.g. mean
# over rows) before plotting.
sns.heatmap(shap_interaction_values)
'''
explainer = shap.TreeExplainer(tuned_model)
explainer.shap_values(get_config('X_train')) # <- it always works 
#shap_interaction_values = explainer.shap_interaction_values(get_config('X_train'))
'''
In [ ]:
# NOTE(review): this calls get_config() with no args and passes its RESULT to
# help; to read the function's own documentation use help(get_config).
help(get_config())
In [ ]: